import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# read in data
accounts_data = pd.read_csv('data/accounts_analytical.csv', low_memory=False)
transactions_data = pd.read_csv('data/transactions.csv', low_memory=False)

# remove accounts without loans
accounts_data = accounts_data[~pd.isna(accounts_data['loan_date'])]

# get the mean balance based on transactions and merge with accounts
mean_bal = (transactions_data.groupby('account_id')['balance']
                                    .mean().round(2)
                                    .reset_index()
                                    .rename(columns = {'balance' : 'avg_bank_balance'}))
accounts_data = accounts_data.merge(mean_bal, how = 'left', on = 'account_id') 

# make loan date
accounts_data['loan_date'] = accounts_data.loan_date.astype('datetime64[D]')
# convert to monthly period
accounts_data['loan_monthly'] = accounts_data.loan_date.dt.to_period('M')
# calculate difference between loan date and account creation date as years
accounts_data['account_loan_diff'] = (accounts_data.loan_date.astype('datetime64[D]') - accounts_data.acct_creation_date.astype('datetime64[D]')).dt.days / 365

# create monthly dataframe for time series plots
accounts_data_monthly = accounts_data[accounts_data['loan_default'] == True].groupby('loan_monthly')['loan_amount'].sum().reset_index()
accounts_data_monthly['loan_date'] = pd.PeriodIndex(accounts_data_monthly.loan_monthly).to_timestamp()
accounts_data_monthly['loan_defaults'] = accounts_data[accounts_data['loan_default'] == True].groupby('loan_monthly').count()['loan_amount'].reset_index().rename(columns = {'loan_amount' : 'loan_defaults'})['loan_defaults']

The first plot answers the question of the relationship between time between account and loan date and the loan defaulting. The violin plot shows that there slight but not significant differences in time between account and loan between good and bad loans.

# set up plot
fig, ax = plt.subplots()
sns.set_style("ticks")
# organize violin plot
sns.violinplot(data=accounts_data,y="account_loan_diff",x='loan_default', palette="muted", ax=ax)
# add labels
## <matplotlib.axes._subplots.AxesSubplot object at 0x00000000DAE15400>
plt.title("Distribution of years between loan and account creation \nsplit by loan default")
## Text(0.5, 1.0, 'Distribution of years between loan and account creation \nsplit by loan default')
plt.ylabel("Time between loan \nand account creation (years)")
## Text(0, 0.5, 'Time between loan \nand account creation (years)')
plt.xlabel("Did the loan default?")
# show the plot
## Text(0.5, 0, 'Did the loan default?')
plt.show()

This next plot shows us when and the size of the loans that defaulted as a scatterplot to see if there are any broad patterns that appear. Given the plot, there is nothing that really sticks out, besides the low defaults in the second half of 1995 and after H2 1998.

# scatter plot for loan date and loan amount colored by default
sns.scatterplot(data=accounts_data, x="loan_date", y="loan_amount", hue="loan_default", alpha=0.6, s = 25)
## <matplotlib.axes._subplots.AxesSubplot object at 0x00000000DAE21820>
plt.title('Scatterplot of loans colored by default status')
## Text(0.5, 1.0, 'Scatterplot of loans colored by default status')
plt.xlabel('Loan open date (month)')
## Text(0.5, 0, 'Loan open date (month)')
plt.ylabel('Loan amount')
## Text(0, 0.5, 'Loan amount')
plt.show()

Since we do not see much in the previous graph, only looking at the number of loan defaults over time might show exposure to adverse broad financial market conditions. We see that there were three broad spikes in defaults for loans started in 1994, 1996, and 1997. This does not show us when the loans went into default, just that they did default.

fig, ax = plt.subplots()
sns.set_style("ticks")
sns.lineplot(data=accounts_data_monthly,x="loan_date", y="loan_defaults", ax=ax)
## <matplotlib.axes._subplots.AxesSubplot object at 0x00000000DC8D0EE0>
plt.title("Monthly time series of loan defaults")
## Text(0.5, 1.0, 'Monthly time series of loan defaults')
plt.ylabel('Number of loan defaults')
## Text(0, 0.5, 'Number of loan defaults')
plt.xlabel('Loan open date (month)')
## Text(0.5, 0, 'Loan open date (month)')
plt.show()

The next plot is another interactive violin plot showing the distribution of monthly loan amount, split by length of loans, and sub-split by loan default. Bad loans are typically have higher loan amounts, and longer loan lengths.

fig = px.violin(accounts_data, y="loan_amount", x="loan_term", color="loan_default", box=True, points="all",
          hover_data=['account_id','avg_bank_balance','loan_date',
       'loan_amount', 'loan_payment', 'loan_term', 'loan_status',
       'loan_default'],
       labels={"loan_payment": "Loan Amount",
               "loan_term": "Length of Loan (months)",
               "loan_default": "Did Loan Default?"
           },
          title="Interactive Distribution of <br>Loan Amount to Loan Default Status")

fig.write_html('temp_plotly1.html')
htmltools::includeHTML("temp_plotly1.html")

The next plot is aother interactive violin plot showing the distribution of monthly loan payment, split by length of loans, and sub-split by loan default. The general trend is that the loans that defaulted usually had a higher average monthly loan payment. This means that loans with higher loan payments are riskier.

import plotly.express as px

fig = px.violin(accounts_data, y="loan_payment", x="loan_term", color="loan_default", box=True, points="all",
          hover_data=['account_id','avg_bank_balance','loan_date',
       'loan_amount', 'loan_payment', 'loan_term', 'loan_status',
       'loan_default'],
       labels={
                     "loan_payment": "Monthly Loan Payment",
                     "loan_term": "Length of Loan (months)",
                     "loan_default": "Did Loan Default?"
                 },
                title="Interactive Distribution of <br>Loan Payment to Loan Default Status")

fig.write_html('temp_plotly2.html')
htmltools::includeHTML("temp_plotly2.html")